Reading data

In [1]:
import pyreadr
import sklearn
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import shap
from lime.lime_tabular import LimeTabularExplainer

# Enable SHAP's interactive JS visualisations in the notebook.
shap.initjs()


# pyreadr returns a dict keyed by the name of the R object stored in each .Rda file.
train = pyreadr.read_r('hmc_train.Rda')['train']
valid = pyreadr.read_r('hmc_valid.Rda')['valid']

# Both splits must describe exactly the same feature set.
assert set(train.columns) == set(valid.columns)

# Fix a deterministic column order so the numpy feature matrices line up
# between train and validation.
column_names = sorted(train.columns)
train = train[column_names]
valid = valid[column_names]

# Keep only predictor names; "PURCHASE" is the target column.
column_names.remove("PURCHASE")


def xy_split(data, y_name="PURCHASE"):
    """Split a DataFrame into a feature matrix X and a target vector y.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both the predictors and the target column.
    y_name : str, optional
        Name of the target column (default "PURCHASE").

    Returns
    -------
    (np.ndarray, np.ndarray)
        Feature matrix (all columns except ``y_name``) and the target vector.
    """
    return data.drop(columns=[y_name]).to_numpy(), data[y_name].to_numpy()


X_train, Y_train = xy_split(train)
X_valid, Y_valid = xy_split(valid)

Training boosting model

In [2]:
# Gradient boosting classifier; hyperparameters chosen by hand.
xgmodel = XGBClassifier(
    max_depth=13,
    objective='binary:logistic',
    gamma=0.1,
)
xgmodel.fit(X_train, Y_train, verbose=True)

# Training-set predictions (kept for later inspection) and validation accuracy.
xg_predictions = xgmodel.predict(X_train)
valid_score = xgmodel.score(X_valid, Y_valid)
print("xgboost valid score {}".format(valid_score))
xgboost valid score 0.8366

Training linear model

In [3]:
# Baseline linear model for comparison with the boosted trees.
logmodel = LogisticRegression(max_iter=1000, solver='lbfgs', C=3)
logmodel.fit(X_train, Y_train)
# BUG FIX: training predictions must come from the logistic model;
# the original cell mistakenly called xgmodel.predict here.
log_predictions = logmodel.predict(X_train)
log_score = logmodel.score(X_valid, Y_valid)
print("logistic valid score {}".format(log_score))
logistic valid score 0.8029
lbfgs failed to converge. Increase the number of iterations.

Dataset characteristics:

In [4]:
# Overview of the training frame: column names, non-null counts, and dtypes.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 70 columns):
AGE                                10000 non-null float64
AGRGT_BAL_ALL_XCLD_MRTG            10000 non-null float64
AUTO_2_OPEN_DATE_YRS               10000 non-null float64
AUTO_HI_CRDT_2_ACTUAL              10000 non-null float64
AVG_BAL_ALL_FNC_REV_ACTS           10000 non-null float64
AVG_BAL_ALL_PRM_BC_ACTS            10000 non-null float64
D_DEPTCARD                         10000 non-null float64
D_NA_AVG_BAL_ALL_FNC_REV_ACTS      10000 non-null float64
D_NA_M_SNCOLDST_BNKINSTL_ACTOPN    10000 non-null float64
D_NA_M_SNC_MST_RCNT_ACT_OPN        10000 non-null float64
D_NA_M_SNC_MST_RCNT_MRTG_DEAL      10000 non-null float64
D_NA_M_SNC_OLDST_MRTG_ACT_OPN      10000 non-null float64
D_NA_RATIO_PRSNL_FNC_BAL2HICRDT    10000 non-null float64
D_REGION_A                         10000 non-null float64
D_REGION_B                         10000 non-null float64
D_REGION_C                         10000 non-null float64
FNC_CARD_OPEN_DATE_YRS             10000 non-null float64
HI_RETAIL_CRDT_LMT                 10000 non-null float64
MAX_MRTG_CLOSE_DATE                10000 non-null float64
MRTG_1_MONTHLY_PAYMENT             10000 non-null float64
MRTG_2_CURRENT_BAL                 10000 non-null float64
M_SNCOLDST_BNKINSTL_ACTOPN         10000 non-null float64
M_SNCOLDST_OIL_NTN_TRD_OPN         10000 non-null float64
M_SNC_MSTRCNT_MRTG_ACT_UPD         10000 non-null float64
M_SNC_MSTREC_INSTL_TRD_OPN         10000 non-null float64
M_SNC_MST_RCNT_60_DAY_RTNG         10000 non-null float64
M_SNC_MST_RCNT_ACT_OPN             10000 non-null float64
M_SNC_MST_RCNT_MRTG_DEAL           10000 non-null float64
M_SNC_OLDST_MRTG_ACT_OPN           10000 non-null float64
M_SNC_OLDST_RETAIL_ACT_OPN         10000 non-null float64
N30D_ORWRS_RTNG_MRTG_ACTS          10000 non-null float64
N_120D_RATINGS                     10000 non-null float64
N_30D_AND_60D_RATINGS              10000 non-null float64
N_30D_RATINGS                      10000 non-null float64
N_ACTS_90D_PLS_LTE_IN_6M           10000 non-null float64
N_ACTS_WITH_MXD_3_IN_24M           10000 non-null float64
N_ACTS_WITH_MXD_4_IN_24M           10000 non-null float64
N_BANK_INSTLACTS                   10000 non-null float64
N_BC_ACTS_OPN_IN_12M               10000 non-null float64
N_BC_ACTS_OPN_IN_24M               10000 non-null float64
N_DEROG_PUB_RECS                   10000 non-null float64
N_DISPUTED_ACTS                    10000 non-null float64
N_FNC_ACTS_OPN_IN_12M              10000 non-null float64
N_FNC_ACTS_VRFY_IN_12M             10000 non-null float64
N_FNC_INSTLACTS                    10000 non-null float64
N_INQUIRIES                        10000 non-null float64
N_OF_MRTG_ACTS_DLINQ_24M           10000 non-null float64
N_OF_SATISFY_FNC_REV_ACTS          10000 non-null float64
N_OPEN_REV_ACTS                    10000 non-null float64
N_PUB_REC_ACT_LINE_DEROGS          10000 non-null float64
N_RETAIL_ACTS_OPN_IN_24M           10000 non-null float64
N_SATISFY_INSTL_ACTS               10000 non-null float64
N_SATISFY_OIL_NATIONL_ACTS         10000 non-null float64
N_SATISFY_PRSNL_FNC_ACTS           10000 non-null float64
PRCNT_OF_ACTS_NEVER_DLQNT          10000 non-null float64
PREM_BANKCARD_CRED_LMT             10000 non-null float64
PURCHASE                           10000 non-null float64
RATIO_BAL_TO_HI_CRDT               10000 non-null float64
RATIO_PRSNL_FNC_BAL2HICRDT         10000 non-null float64
RATIO_RETAIL_BAL2HI_CRDT           10000 non-null float64
STUDENT_HI_CRED_RANGE              10000 non-null float64
STUDENT_OPEN_DATE_YRS              10000 non-null float64
TOT_BAL_ALL_DPT_STORE_ACTS         10000 non-null float64
TOT_HI_CRDT_CRDT_LMT               10000 non-null float64
TOT_INSTL_HI_CRDT_CRDT_LMT         10000 non-null float64
TOT_NOW_LTE                        10000 non-null float64
TOT_OTHRFIN_HICRDT_CRDTLMT         10000 non-null float64
TREATMENT                          10000 non-null float64
UNIQUE_ID                          10000 non-null float64
UPSCALE_OPEN_DATE_YRS              10000 non-null float64
dtypes: float64(70)
memory usage: 5.3 MB
In [5]:
# Fraction of positive (PURCHASE = 1) examples in the training target.
positive_rate = np.average(Y_train)
print("Dataset bias: {}".format(positive_rate))
Dataset bias: 0.1996

This is an uplift modelling dataset (from the R package Information), with the indicator variable "TREATMENT" as a column (prepared for the single-model approach). The data is highly imbalanced, so the scores above do not seem to be good (the logistic regression's score is close to that of always answering "no"). Another issue is that the column names lack explanations, so I will try to focus on columns whose names are clear to understand.

In [6]:
# LIME explainer over the raw training matrix; continuous features are
# discretised so explanations read as interval rules.
explainer = LimeTabularExplainer(
    X_train,
    feature_names=column_names,
    class_names=['NOT', 'BUY'],
    discretize_continuous=True,
)
In [7]:
def lime_explainer(X, Y, model, features=3):
    """Explain one prediction with LIME and display the result in the notebook.

    Uses the module-level ``explainer`` (LimeTabularExplainer) built earlier.

    Parameters
    ----------
    X : np.ndarray
        A single observation (1-D feature vector).
    Y : float or int
        Ground-truth label for the observation (truthy means PURCHASE).
    model : object
        Fitted classifier exposing ``predict_proba``.
    features : int, optional
        Number of features to include in the explanation (default 3).
    """
    model_explainer = explainer.explain_instance(X, model.predict_proba, num_features=features)
    model_explainer.show_in_notebook(show_table=True, show_all=False)

    # Probability of the positive class for this single observation.
    proba = model.predict_proba([X])[0][1]
    # FIX: corrected the typo "answear" in the user-facing message.
    print("Purchase probability {}, Right answer {}\n".format(proba, "PURCHASE" if Y else "NO PURCHASE"))

    # Split the LIME weights by sign; magnitudes are reported as positive
    # numbers and sorted from the most to the least important.
    explanations = model_explainer.as_list()
    positive = sorted([(round(imp, 2), text) for text, imp in explanations if imp > 0], reverse=True)
    negative = sorted([(-round(imp, 2), text) for text, imp in explanations if imp < 0], reverse=True)
    positive = pd.DataFrame.from_records(positive, columns=['Importance', 'Explanation'])
    negative = pd.DataFrame.from_records(negative, columns=['Importance', 'Explanation'])
    print("Parameters in favor of PURCHASE:")
    display(positive)
    print("Parameters in favor of NO PURCHASE:")
    display(negative)

Observation 11 explanation

XGBoost model

In [8]:
# Validation-set observation to explain next.
index = 11
In [9]:
# LIME explanation of the xgboost prediction for validation observation `index`.
lime_explainer(X_valid[index], Y_valid[index], xgmodel, features=5)
Purchase probability 0.7460730671882629, Right answear PURCHASE

Parameters in favor of PURCHASE:
Importance Explanation
0 0.18 N_OPEN_REV_ACTS > 7.00
1 0.08 STUDENT_HI_CRED_RANGE <= 0.00
2 0.08 D_REGION_A <= 0.00
3 0.05 PREM_BANKCARD_CRED_LMT <= 0.00
Parameters in favor of NO PURCHASE:
Importance Explanation
0 0.12 MRTG_1_MONTHLY_PAYMENT > 414.00

This is a rare case of the model predicting the right answer of PURCHASE, considering the structure of the dataset. The explanations, however, show that this is not an easy task. Only one parameter is driving the prediction towards the wrong answer — "monthly mortgage payment" with a value of 414 — probably meaning that the client might be overloaded with payments and not willing to invest in another one. But the other parameters tipping the balance towards the PURCHASE option show that the client is probably wealthy (more than 7 open revolving accounts, and no limit on a premium credit card), so they can afford more expenses. Also, region A is probably poor, so people outside of it are more likely to buy.

Linear model

In [10]:
# Same observation, explained for the logistic regression model.
lime_explainer(X_valid[index], Y_valid[index], logmodel, features=5)
Purchase probability 0.4315413228366079, Right answear PURCHASE

Parameters in favor of PURCHASE:
Importance Explanation
0 0.13 TOT_HI_CRDT_CRDT_LMT > 110366.25
1 0.11 HI_RETAIL_CRDT_LMT > 1300.00
2 0.08 M_SNC_OLDST_RETAIL_ACT_OPN > 164.24
Parameters in favor of NO PURCHASE:
Importance Explanation
0 0.15 MRTG_1_MONTHLY_PAYMENT > 414.00
1 0.09 AVG_BAL_ALL_FNC_REV_ACTS > 1767.20

The linear model, surprisingly, is close to predicting the correct answer. The same parameter connected to the high mortgage payment suggests "NO PURCHASE", but it is also counterweighted, in a more linear manner, by parameters linked to the wealth of this client (high credit card limits).

Observation 2 explanation

XGBoost model

In [11]:
# Switch to validation observation 2.
index = 2
In [12]:
# LIME explanation of the xgboost prediction for observation 2.
lime_explainer(X_valid[index], Y_valid[index], xgmodel, features=5)
Purchase probability 0.0037495701108127832, Right answear NO PURCHASE

Parameters in favor of PURCHASE:
Importance Explanation
0 0.10 MRTG_1_MONTHLY_PAYMENT <= 0.00
1 0.08 STUDENT_HI_CRED_RANGE <= 0.00
2 0.08 D_REGION_A <= 0.00
Parameters in favor of NO PURCHASE:
Importance Explanation
0 0.09 N_OPEN_REV_ACTS <= 0.00
1 0.06 TOT_HI_CRDT_CRDT_LMT <= 0.00

In this case, the gradient boosting model is sure that this is a clear "not buying" case. We can see its non-linearity, because here no mortgage payment (indicated by a value of zero) slightly suggests buying, instead of not buying as in the previous case. But it is outweighed by no open revolving accounts (and, in this case, a total credit limit equal to zero). Also, here we can see that 5 parameters cannot fully describe how this model works.

Linear model

In [13]:
# Same observation, explained for the logistic regression model.
lime_explainer(X_valid[index], Y_valid[index], logmodel, features=5)
Purchase probability 0.17391004773125385, Right answear NO PURCHASE

Parameters in favor of PURCHASE:
Importance Explanation
0 0.11 MRTG_1_MONTHLY_PAYMENT <= 0.00
1 0.09 AVG_BAL_ALL_FNC_REV_ACTS <= 1767.20
2 0.07 MRTG_2_CURRENT_BAL <= 0.00
Parameters in favor of NO PURCHASE:
Importance Explanation
0 0.08 HI_RETAIL_CRDT_LMT <= 0.00
1 0.05 TOT_HI_CRDT_CRDT_LMT <= 0.00

As we see, the linear model cannot capture the non-linear relation with the monthly mortgage payment, so here it also drives the prediction towards "NO PURCHASE" (contrary to the previous model). This answer, however, is still "NO" because the classifier is fitted to almost always answer so, since it could not understand the data.

Searching for false positives

Clients whom we predicted to buy, but who didn't.

In [14]:
# Validation indices where the model predicts PURCHASE (1) but the true label
# is 0 — i.e. the prediction differs from the truth and the truth is 0.
np.where((xgmodel.predict(X_valid) != Y_valid) & (Y_valid == 0))
Out[14]:
(array([   9,   42,   52,   76,   80,   90,  103,  137,  173,  183,  278,
         297,  301,  340,  357,  358,  376,  381,  385,  390,  398,  463,
         467,  500,  507,  516,  539,  588,  589,  599,  656,  711,  713,
         740,  758,  767,  789,  793,  895,  926,  944,  963, 1002, 1013,
        1022, 1048, 1091, 1120, 1136, 1143, 1163, 1205, 1213, 1267, 1272,
        1305, 1309, 1357, 1363, 1387, 1439, 1465, 1466, 1525, 1535, 1542,
        1547, 1620, 1631, 1668, 1763, 1775, 1812, 1813, 1831, 1844, 1860,
        1917, 1955, 2022, 2155, 2217, 2280, 2310, 2335, 2336, 2338, 2400,
        2410, 2419, 2456, 2466, 2545, 2581, 2597, 2625, 2633, 2641, 2675,
        2679, 2712, 2722, 2749, 2771, 2773, 2812, 2855, 2915, 2926, 2930,
        2974, 2982, 3017, 3062, 3135, 3139, 3195, 3202, 3228, 3255, 3256,
        3286, 3312, 3314, 3325, 3354, 3355, 3495, 3520, 3538, 3582, 3605,
        3607, 3626, 3649, 3655, 3656, 3658, 3663, 3702, 3705, 3736, 3749,
        3767, 3830, 3832, 3923, 3932, 3979, 4025, 4056, 4085, 4091, 4098,
        4117, 4183, 4184, 4408, 4441, 4453, 4499, 4534, 4546, 4559, 4594,
        4631, 4635, 4690, 4704, 4811, 4882, 4930, 4931, 4952, 4974, 5109,
        5133, 5174, 5253, 5287, 5370, 5380, 5388, 5487, 5529, 5584, 5597,
        5603, 5624, 5633, 5711, 5757, 5763, 5807, 5851, 5877, 5976, 6025,
        6048, 6055, 6068, 6115, 6141, 6150, 6202, 6251, 6261, 6263, 6298,
        6325, 6367, 6393, 6412, 6422, 6423, 6432, 6537, 6544, 6549, 6706,
        6742, 6765, 6833, 6842, 6856, 6868, 6888, 6919, 6925, 7000, 7007,
        7063, 7073, 7183, 7185, 7230, 7233, 7278, 7293, 7306, 7319, 7323,
        7335, 7343, 7376, 7425, 7534, 7576, 7577, 7630, 7727, 7772, 7776,
        7780, 7809, 7818, 7833, 7907, 7940, 7957, 7969, 7970, 7983, 8040,
        8063, 8072, 8094, 8140, 8160, 8193, 8247, 8254, 8308, 8318, 8346,
        8366, 8434, 8444, 8447, 8458, 8478, 8599, 8679, 8712, 8725, 8774,
        8775, 8841, 8895, 8917, 8943, 8949, 8978, 9015, 9030, 9136, 9151,
        9195, 9200, 9216, 9251, 9320, 9323, 9364, 9367, 9382, 9583, 9604,
        9654, 9684, 9734, 9761, 9781, 9800, 9823, 9830, 9839, 9871, 9884,
        9981]),)

Observation 9 explanation

XGBoost model

In [15]:
# First index from the mismatch list above.
index = 9
In [16]:
# LIME explanation of the xgboost prediction for observation 9.
lime_explainer(X_valid[index], Y_valid[index], xgmodel, features=5)
Purchase probability 0.6327625513076782, Right answear NO PURCHASE

Parameters in favor of PURCHASE:
Importance Explanation
0 0.09 STUDENT_HI_CRED_RANGE <= 0.00
1 0.08 D_REGION_A <= 0.00
2 0.05 PREM_BANKCARD_CRED_LMT <= 0.00
Parameters in favor of NO PURCHASE:
Importance Explanation
0 0.12 N_DISPUTED_ACTS <= 0.00
1 0.08 0.00 < N_OPEN_REV_ACTS <= 3.00

This is a very rare case of a false-positive prediction by the gradient boosting model (considering that negative answers make up 80% of the dataset). As before, a small number of open revolving accounts together with no disputed accounts is considered a predisposition not to purchase (although it is interesting why this has to count as such?). But the bigger impact here comes from having no student account and not living in a poor region, together with no premium credit card (which is probably mistaken for having no limit). Concluding, this prediction does not seem reasonable, because the above odds for buying are quite common (unlike buying itself).

Linear model

In [17]:
# Same observation, explained for the logistic regression model.
lime_explainer(X_valid[index], Y_valid[index], logmodel, features=5)
Purchase probability 0.35523265892648087, Right answear NO PURCHASE

Parameters in favor of PURCHASE:
Importance Explanation
0 0.10 M_SNCOLDST_OIL_NTN_TRD_OPN <= 186.51
1 0.08 M_SNC_OLDST_RETAIL_ACT_OPN > 164.24
2 0.06 MRTG_2_CURRENT_BAL <= 0.00
Parameters in favor of NO PURCHASE:
Importance Explanation
0 0.07 HI_RETAIL_CRDT_LMT <= 0.00
1 0.05 M_SNC_MST_RCNT_ACT_OPN > 29.88

The linear model, of course, does not make this false-positive error, because it almost always answers no. It is interesting that the parameters in favor of buying here are different than in the gradient boosting model, and, considering how it works, the model is still not sure about the answer.

In [ ]: